SETUP
# RESULTS for report
# 27.03.2021
# Load packages
library(summarytools)
## Registered S3 method overwritten by 'pryr':
## method from
## print.bytes Rcpp
## For best results, restart R session and update pander using devtools:: or remotes::install_github('rapporter/pander')
library(knitr)
library(kableExtra)
library(sjPlot)
library(sjstats)
library(reshape2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(questionr)
##
## Attaching package: 'questionr'
## The following object is masked from 'package:sjstats':
##
## prop
## The following object is masked from 'package:summarytools':
##
## freq
# load utility functions
source("./R/utilities.R")
## Load data and specify inclusion/exclusion
# read data 2004-2018
hse_combined = read.csv("./data/HSE_combined/hse_combined.csv")
# recode imd
hse_combined$imd <- as.factor(hse_combined$imd)
levels(hse_combined$imd) = c(
"Least deprived",
"Less deprived",
"Median deprived",
"More deprieved",
"Most deprived"
)
# EXCLUDE: < 16 years
hse_combined <- hse_combined[hse_combined$age5 >= 16,]
DATA SET DESCRIPTION
CLICK TO EXPAND
## Data set description
# Data rows: overall
nrow(hse_combined)
## [1] 149596
# Data rows: by year
kbl <- by(hse_combined,hse_combined$year, nrow)
kbl <- do.call(rbind, list(kbl))
kbl <- kable(x = kbl, format = "simple", format.args = list(big.mark = ","))
kbl
| 14,836 |
6,704 |
10,303 |
14,142 |
15,098 |
8,420 |
8,610 |
8,290 |
8,077 |
7,997 |
8,178 |
# MISSING eq5d % by year
kbl <- by(hse_combined$eq5d,hse_combined$year, function(x){
round( sum(is.na(x))/length(x) , 4) * 100
} )
kbl <- do.call(cbind, list(kbl))
kable(x = kbl, format = "simple",col.names = c("% missing"))
| 2003 |
7.30 |
| 2004 |
8.80 |
| 2005 |
10.60 |
| 2006 |
8.60 |
| 2008 |
6.52 |
| 2010 |
12.92 |
| 2011 |
12.69 |
| 2012 |
12.01 |
| 2014 |
12.28 |
| 2017 |
10.35 |
| 2018 |
11.42 |
# NON-MISSING: rows with complete data
round(sum(complete.cases(hse_combined))/nrow(hse_combined),4)*100
## [1] 34.66
# MISSING data %; by variable
kbl <- apply(hse_combined,2,function(x){
round( sum(is.na(x))/nrow(hse_combined) , 4) * 100
} )
kbl
## year age imd sex wt eq5d
## 26.03 36.84 26.03 26.03 26.03 33.32
## mo sc ua pd ad hh_id
## 32.14 32.38 32.30 32.16 32.42 36.84
## dis_infect dis_cancer dis_endo dis_blood dis_mental dis_nerv
## 26.06 26.06 26.06 26.06 26.06 26.06
## dis_eye dis_ear dis_circ dis_resp dis_digest dis_genito
## 26.06 26.06 26.06 26.06 26.06 26.06
## dis_skin dis_musculo limitill cigst1 bmival topqual3
## 26.06 26.06 26.07 26.55 37.84 26.33
## econact eqv5 acutill genhelf2 ghq12scr age5.bin
## 37.05 43.99 26.08 26.06 42.95 26.03
## age5
## 26.03
## Exclude cases without eq5d for now
# EXCLUDE: EQ5D missing
hse_combined <- hse_combined[!is.na(hse_combined$eq5d),]
# Data rows: overall
nrow(hse_combined)
## [1] 99758
## Descriptives: age5
# distribution: age5 by IMD
ggplot(hse_combined) +
geom_histogram(aes(age5), col = "lightgray",fill = "cadetblue") +
facet_wrap(~imd) +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# distribution: age5 by IMD for age5 >= 85
ggplot(hse_combined[hse_combined$age5>75,]) +
geom_histogram(aes(age5), col = "lightgray",fill = "cadetblue",binwidth = 1) +
facet_wrap(~imd) +
theme_minimal()

## Descriptives: by variable
print(dfSummary(
hse_combined,
plain.ascii = FALSE,
style = "grid",
graph.magnif = 0.75,
valid.col = FALSE,
tmp.img.dir = "/tmp"
),
method = 'render')
TABLES 1+2 and FIGURES 1 + 2: Mean EQ-5D scores
# --------------------------------------------------------
## Descriptive stats: regression model
# > `eq5d = age5 x imd + year`
# age5, imd, and year are treated as factors
imd_lvls <- levels(hse_combined$imd)
uniq_age5 <- unique(hse_combined$age5)
uniq_age5 <- uniq_age5[order(uniq_age5)]
uniq_age5_labels <- c("16-17?","18-29?",paste0(uniq_age5[-c(1,2,17)],"-",uniq_age5[-c(1,2,17)]+4),"90+")
# descriptive regression model - FEMALES
descr_lm_f <- lm(eq5d ~ as.factor(age5) * imd + as.factor(year),hse_combined[hse_combined$sex =="Female",], weights = wt)
# descriptive regression model - ALES
descr_lm_m <- lm(eq5d ~ as.factor(age5) * imd + as.factor(year),hse_combined[hse_combined$sex =="Male",], weights = wt)
# BUT WHICH YEAR SHOULD WE USE TO PREDICT MEANS?
# SIDENOTE: Year doesnt really matter
# 2017 and 2018 are signficantly lower - maybe because of 5L?
year_coef_indices <- grepl("year",names(descr_lm_f$coefficients))
cbind(
descr_lm_f$coefficients[year_coef_indices],
confint(descr_lm_f)[year_coef_indices,]
)
## 2.5 % 97.5 %
## as.factor(year)2004 -0.0044214970 -0.011476447 0.002633453
## as.factor(year)2005 -0.0022866283 -0.013619189 0.009045932
## as.factor(year)2006 0.0013514399 -0.008051916 0.010754795
## as.factor(year)2008 -0.0062498539 -0.015473651 0.002973944
## as.factor(year)2010 -0.0069759007 -0.018092429 0.004140627
## as.factor(year)2011 -0.0323490885 -0.043375510 -0.021322666
## as.factor(year)2012 -0.0041028705 -0.015230639 0.007024898
## as.factor(year)2014 0.0008294763 -0.010373537 0.012032490
## as.factor(year)2017 -0.0246095991 -0.035806140 -0.013413058
## as.factor(year)2018 -0.0306793571 -0.041840440 -0.019518274
# REGRESSION MODEL WITHOUT YEAR AS PREDICTOR
# descriptive regression model - FEMALES
descr_lm_f <- lm(eq5d ~ as.factor(age5) * imd ,hse_combined[hse_combined$sex =="Female",], weights = wt)
# descriptive regression model - MALES
descr_lm_m <- lm(eq5d ~ as.factor(age5) * imd ,hse_combined[hse_combined$sex =="Male",], weights = wt)
# predict means by strata
# QUESTION: WHICH YEAR SHOULD WE USE TO PREDICT MEANS?
# OR BETTER USE EMPIRICAL AGGREGATE MEANS?
pred_df_f <- pred_df_m <- data.frame(age5 = rep(uniq_age5,5),imd = rep(imd_lvls,each = length(uniq_age5)))
pred_df_f_ratio <- pred_df_m_ratio <- data.frame(age5 = rep(uniq_age5,2),imd = rep(imd_lvls[c(1,5)],each = length(uniq_age5)))
# FIGURE 1
eq5d_pred_m <- predict(descr_lm_m, newdata = pred_df_m,interval ="predict")
## Warning in predict.lm(descr_lm_m, newdata = pred_df_m, interval = "predict"): Assuming constant prediction variance even though model fit is weighted
eq5d_pred_m_plot <- cbind(pred_df_m, eq5d_pred_m)
figure1 <- ggplot(eq5d_pred_m_plot) +
# geom_ribbon(aes(x=age5, ymin = lwr, ymax = upr, fill = imd), alpha = 0.3) +
geom_point(aes(x=age5, y = fit, col = imd)) +
geom_line(aes(x=age5, y = fit, col = imd)) +
ylim(c(0,1)) +
ylab("Mean EQ-5D index") +
xlab("Age group") +
scale_x_continuous(name = "Age group", breaks = uniq_age5, labels = uniq_age5_labels) +
ggtitle("Pooled mean EQ-5D Score by IMD quintile and age - MALE") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 30),
legend.position = "top")
figure1

ggsave(plot = figure1, filename = "./outputs/figure1.jpg",height = 5, width = 7)
# TABLE 1
eq5d_pred_m <- formatC(eq5d_pred_m,digits = 2,format = "f")
eq5d_pred_m <- paste0(eq5d_pred_m[,1]," (",eq5d_pred_m[,2],"; ",eq5d_pred_m[,3],")")
pred_df_m$eq5d <- eq5d_pred_m
# the inequality ratio ci are bootstrapped
ineq_m <- ineq_boot1(hse_combined, "Male",boot_iter = 1000)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following object is masked from 'package:ggplot2':
##
## :=
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
ineq_m_formated <- formatC(as.matrix(ineq_m$plot_df),digits = 2,format = "f")
ineq_m_formated <- paste0(ineq_m_formated[,2]," (",ineq_m_formated[,3],"; ",ineq_m_formated[,4],")")
pred_df_m <- reshape(pred_df_m,direction = "wide",timevar = "imd" ,idvar = "age5")
pred_df_m <- cbind(pred_df_m, "most/least ratio" = ineq_m_formated)
pred_df_m
write.csv(pred_df_m, "./outputs/table1.csv", row.names = F)
# FIGURE 2
eq5d_pred_f <- predict(descr_lm_f, newdata = pred_df_f,interval ="predict")
## Warning in predict.lm(descr_lm_f, newdata = pred_df_f, interval = "predict"): Assuming constant prediction variance even though model fit is weighted
eq5d_pred_f_plot <- cbind(pred_df_f, eq5d_pred_f)
figure2 <- ggplot(eq5d_pred_f_plot) +
# geom_ribbon(aes(x=age5, ymin = lwr, ymax = upr, fill = imd), alpha = 0.3) +
geom_point(aes(x=age5, y = fit, col = imd)) +
geom_line(aes(x=age5, y = fit, col = imd)) +
ylim(c(0,1)) +
ylab("Mean EQ-5D index") +
xlab("Age group") +
scale_x_continuous(name = "Age group", breaks = uniq_age5, labels = uniq_age5_labels) +
ggtitle("Pooled mean EQ-5D Score by IMD quintile and age - FEMALE") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 30),
legend.position = "top")
figure2

ggsave(plot = figure2, filename = "./outputs/figure2.jpg",height = 5, width = 7)
# TABLE 2
eq5d_pred_f <- formatC(eq5d_pred_f,digits = 2,format = "f")
eq5d_pred_f <- paste0(eq5d_pred_f[,1]," (",eq5d_pred_f[,2],"; ",eq5d_pred_f[,3],")")
pred_df_f$eq5d <- eq5d_pred_f
# bootstrap ineq ratio
ineq_f <- ineq_boot1(hse_combined, "Female",boot_iter = 1000)
ineq_f_formated <- formatC(as.matrix(ineq_f$plot_df),digits = 2,format = "f")
ineq_f_formated <- paste0(ineq_f_formated[,2]," (",ineq_f_formated[,3],"; ",ineq_f_formated[,4],")")
pred_df_f <- reshape(pred_df_f,direction = "wide",timevar = "imd" ,idvar = "age5")
pred_df_f <- cbind(pred_df_f, "most/least ratio" = ineq_f_formated)
pred_df_f
write.csv(pred_df_f, "./outputs/table2.csv", row.names = F)
SIDENOTE: Any difference in variance between male and female?
# Yes, females have about 7% higher SD
eq5d_sd_sex_tot <- aggregate(eq5d ~ sex, hse_combined, sd)
(eq5d_sd_sex_tot$eq5d[1] / eq5d_sd_sex_tot$eq5d[2] - 1) * 100
## [1] 7.038993
eq5d_sd_sex <- aggregate(eq5d ~ sex + age5, hse_combined, sd)
eq5d_sd_sex <- reshape(eq5d_sd_sex,direction = "wide",timevar = "sex" ,idvar = "age5")
eq5d_sd_sex$female_sd_proz <- ( (eq5d_sd_sex$eq5d.Female / eq5d_sd_sex$eq5d.Male) - 1) * 100
eq5d_sd_sex
FIGURES 9 + 10: Proportion reporting each level of response for each EQ-5D dimension
# FIGURE 9
# ONLY FOR 3L DIMENSIONS 2003-2014 !
# EXCLUDING 2017 + 2018
# source("./R/utilities.R")
dim_agg1 <- dimGetter(hse_combined[hse_combined$year <= 2014,], 1)
dim_agg2 <- dimGetter(hse_combined[hse_combined$year <= 2014,], 2)
dim_agg3 <- dimGetter(hse_combined[hse_combined$year <= 2014,], 3)
dim_aggs <- merge(dim_agg1,dim_agg2, all.x=T, all.y = T)
dim_aggs <- merge(dim_aggs,dim_agg3, all.x=T, all.y = T)
dim_aggs_long = melt(dim_aggs,id.vars = c("imd","sex","age5","level"))
dim_aggs_long_rel <- dim_aggs_long[grepl("rel",dim_aggs_long$variable),]
# dim_aggs_long_rel$value[is.na(dim_aggs_long_rel$value)] <- 0
dim_aggs_long_rel$dimension = substr(dim_aggs_long_rel$variable,1,2)
dim_aggs_long_rel$dimension = factor(dim_aggs_long_rel$dimension,
levels=c("mo","sc","ua","pd","ad")
)
dim_aggs_long_rel$dimension <- recode(
dim_aggs_long_rel$dimension,
mo = "Mobility",
sc = "Self-care",
ua = "Usual activities",
pd = "Pain/Discomfort",
ad = "Anxiety/Depression"
)
col_labs <- c("No problems","Some problems","Extreme problems")
figure9 <- ggplot(dim_aggs_long_rel[dim_aggs_long_rel$sex=="Male",],
aes(x=as.factor(age5), y=value,fill=as.factor(level),
col=as.factor(level))) +
geom_col(position = 'stack',alpha=0.6) +
facet_grid(dimension ~ imd) +
scale_color_manual(values = c("lightgreen","orange","red"),
labels = c(col_labs),name="Level") +
scale_fill_manual(values = c("lightgreen","orange","red"),
labels = c(col_labs),name="Level") +
xlab("Age group") +
ylab("Proportion") +
theme_minimal()+
theme(legend.position = "bottom") +
ggtitle("Proportion reporting each level of response for each EQ-5D dimension over age - Males") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45),
legend.position = "top")
figure9
## Warning: Removed 2663 rows containing missing values (position_stack).

ggsave(plot = figure9, filename = "./outputs/figure9.jpg",height = 8, width = 9)
## Warning: Removed 2663 rows containing missing values (position_stack).
figure10 <- ggplot(dim_aggs_long_rel[dim_aggs_long_rel$sex=="Female",],
aes(x=as.factor(age5), y=value,fill=as.factor(level),
col=as.factor(level))) +
geom_col(position = 'stack',alpha=0.6) +
facet_grid(dimension ~ imd) +
scale_color_manual(values = c("lightgreen","orange","red"),
labels = c(col_labs),name="Level") +
scale_fill_manual(values = c("lightgreen","orange","red"),
labels = c(col_labs),name="Level") +
xlab("Age group") +
ylab("Proportion") +
theme_minimal()+
theme(legend.position = "bottom") +
ggtitle("Proportion reporting each level of response for each EQ-5D dimension over age - Females") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45),
legend.position = "top")
figure10
## Warning: Removed 2646 rows containing missing values (position_stack).

ggsave(plot = figure10, filename = "./outputs/figure10.jpg",height = 8, width = 9)
## Warning: Removed 2646 rows containing missing values (position_stack).
FIGURES 11 - 16: IMD1/IMD5 ratio of % reporting problems for each EQ-5D dimension
# ------------------------------------------------------------------------
#### Line graph showing ratio of IMD1/IMD5 for each EQ-5D dimension over age
# FIGURE 11
figure11 <- dimProbPlotter(dim_aggs_long_rel,1,label = "No Problems","Male")
figure11

ggsave(plot = figure11, filename = "./outputs/figure11.jpg",height = 8, width = 9)
# FIGURE 12
figure12 <- dimProbPlotter(dim_aggs_long_rel,1,label = "No Problems","Female")
figure12

ggsave(plot = figure12, filename = "./outputs/figure12.jpg",height = 8, width = 9)
# FIGURE 13
figure13 <- dimProbPlotter(dim_aggs_long_rel,2,label = "Some Problems","Male")
figure13

ggsave(plot = figure13, filename = "./outputs/figure13.jpg",height = 8, width = 9)
# FIGURE 14
figure14 <- dimProbPlotter(dim_aggs_long_rel,2,label = "Some Problems","Female")
figure14

ggsave(plot = figure14, filename = "./outputs/figure14.jpg",height = 8, width = 9)
# FIGURE 15
figure15 <- dimProbPlotter(dim_aggs_long_rel,3,label = "Extreme Problems","Male")
figure15
## Warning: Removed 6 row(s) containing missing values (geom_path).

ggsave(plot = figure15, filename = "./outputs/figure15.jpg",height = 8, width = 9)
## Warning: Removed 6 row(s) containing missing values (geom_path).
# FIGURE 16
figure16 <- dimProbPlotter(dim_aggs_long_rel,3,label = "Extreme Problems","Female")
figure16
## Warning: Removed 6 row(s) containing missing values (geom_path).

ggsave(plot = figure16, filename = "./outputs/figure16.jpg",height = 8, width = 9)
## Warning: Removed 6 row(s) containing missing values (geom_path).
fin.